import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
# Load the marketing dataset from a CSV given by relative path, then preview
# its first rows.
creditcard_df = pd.read_csv('Marketing_data.csv')
creditcard_df.head()
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
# Dimensions of the frame as (rows, columns).
creditcard_df.shape
(8950, 18)
# Column labels of the loaded frame.
creditcard_df.columns
Index(['CUST_ID', 'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE'],
dtype='object')
# Data type of each column.
creditcard_df.dtypes
CUST_ID object BALANCE float64 BALANCE_FREQUENCY float64 PURCHASES float64 ONEOFF_PURCHASES float64 INSTALLMENTS_PURCHASES float64 CASH_ADVANCE float64 PURCHASES_FREQUENCY float64 ONEOFF_PURCHASES_FREQUENCY float64 PURCHASES_INSTALLMENTS_FREQUENCY float64 CASH_ADVANCE_FREQUENCY float64 CASH_ADVANCE_TRX int64 PURCHASES_TRX int64 CREDIT_LIMIT float64 PAYMENTS float64 MINIMUM_PAYMENTS float64 PRC_FULL_PAYMENT float64 TENURE int64 dtype: object
# Summary of the index range, per-column non-null counts, dtypes and memory use.
creditcard_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8950 entries, 0 to 8949 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CUST_ID 8950 non-null object 1 BALANCE 8950 non-null float64 2 BALANCE_FREQUENCY 8950 non-null float64 3 PURCHASES 8950 non-null float64 4 ONEOFF_PURCHASES 8950 non-null float64 5 INSTALLMENTS_PURCHASES 8950 non-null float64 6 CASH_ADVANCE 8950 non-null float64 7 PURCHASES_FREQUENCY 8950 non-null float64 8 ONEOFF_PURCHASES_FREQUENCY 8950 non-null float64 9 PURCHASES_INSTALLMENTS_FREQUENCY 8950 non-null float64 10 CASH_ADVANCE_FREQUENCY 8950 non-null float64 11 CASH_ADVANCE_TRX 8950 non-null int64 12 PURCHASES_TRX 8950 non-null int64 13 CREDIT_LIMIT 8949 non-null float64 14 PAYMENTS 8950 non-null float64 15 MINIMUM_PAYMENTS 8637 non-null float64 16 PRC_FULL_PAYMENT 8950 non-null float64 17 TENURE 8950 non-null int64 dtypes: float64(14), int64(3), object(1) memory usage: 1.2+ MB
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
creditcard_df.describe()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8949.000000 | 8950.000000 | 8637.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.449450 | 1733.143852 | 864.206542 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.815725 | 2895.063757 | 2372.446607 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 169.123707 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 825.485459 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
# Print each column label followed by the distinct values found in that column.
for column_name, column_values in creditcard_df.items():
    print(column_name)
    print(column_values.unique())
CUST_ID ['C10001' 'C10002' 'C10003' ... 'C19188' 'C19189' 'C19190'] BALANCE [ 40.900749 3202.467416 2495.148862 ... 23.398673 13.457564 372.708075] BALANCE_FREQUENCY [0.818182 0.909091 1. 0.636364 0.545455 0.875 0.454545 0.727273 0.5 0.888889 0.090909 0.272727 0.363636 0. 0.666667 0.75 0.857143 0.181818 0.333333 0.6 0.3 0.125 0.9 0.833333 0.8 0.2 0.777778 0.555556 0.25 0.142857 0.571429 0.4 0.444444 0.714286 0.222222 0.1 0.625 0.428571 0.111111 0.285714 0.7 0.375 0.166667] PURCHASES [ 95.4 0. 773.17 ... 291.12 144.4 1093.25] ONEOFF_PURCHASES [ 0. 773.17 1499. ... 734.4 1012.73 1093.25] INSTALLMENTS_PURCHASES [ 95.4 0. 1333.28 ... 113.28 291.12 144.4 ] CASH_ADVANCE [ 0. 6442.945483 205.788017 ... 8555.409326 36.558778 127.040008] PURCHASES_FREQUENCY [0.166667 0. 1. 0.083333 0.666667 0.333333 0.25 0.75 0.5 0.416667 0.916667 0.583333 0.375 0.625 0.272727 0.833333 0.909091 0.111111 0.142857 0.090909 0.363636 0.1 0.875 0.125 0.818182 0.636364 0.2 0.8 0.3 0.9 0.285714 0.727273 0.181818 0.7 0.545455 0.888889 0.714286 0.454545 0.857143 0.555556 0.428571 0.4 0.571429 0.6 0.222222 0.777778 0.444444] ONEOFF_PURCHASES_FREQUENCY [0. 1. 0.083333 0.166667 0.25 0.916667 0.5 0.416667 0.333333 0.666667 0.375 0.583333 0.1 0.090909 0.833333 0.75 0.111111 0.142857 0.125 0.875 0.363636 0.2 0.818182 0.8 0.3 0.636364 0.181818 0.909091 0.285714 0.222222 0.727273 0.571429 0.6 0.272727 0.714286 0.545455 0.428571 0.444444 0.454545 0.625 0.777778 0.555556 0.7 0.9 0.4 0.857143 0.888889] PURCHASES_INSTALLMENTS_FREQUENCY [0.083333 0. 0.583333 1. 0.25 0.916667 0.75 0.5 0.333333 0.666667 0.416667 0.166667 0.833333 0.4 0.181818 0.818182 0.272727 0.375 0.125 0.636364 0.545455 0.909091 0.888889 0.2 0.8 0.1 0.142857 0.857143 0.444444 0.454545 0.111111 0.6 0.555556 0.777778 0.3 0.9 0.363636 0.714286 0.875 0.222222 0.285714 0.7 0.727273 0.571429 0.090909 0.428571 0.625 ] CASH_ADVANCE_FREQUENCY [0. 
0.25 0.083333 0.166667 0.333333 0.363636 0.833333 0.5 0.727273 0.125 0.875 0.111111 0.416667 0.181818 0.545455 0.75 0.142857 0.583333 0.666667 0.222222 0.285714 0.909091 0.2 0.625 0.090909 0.4 1. 0.8 0.636364 0.3 0.916667 0.444444 1.25 0.1 0.428571 0.272727 0.555556 0.6 0.454545 1.166667 0.375 0.777778 0.714286 0.571429 0.857143 1.125 1.1 1.5 0.7 0.818182 0.9 0.888889 1.090909 1.142857] CASH_ADVANCE_TRX [ 0 4 1 3 6 13 5 16 10 2 11 7 12 37 27 23 21 14 40 8 9 26 15 18 28 24 20 17 22 31 123 52 51 62 19 25 30 29 53 45 43 42 107 38 56 39 32 33 50 34 63 36 110 47 48 71 35 93 80 44 61 46 49 69 41] PURCHASES_TRX [ 2 0 12 1 8 64 5 3 6 26 11 9 92 17 13 45 14 41 27 20 87 18 4 42 61 33 7 50 22 23 60 46 75 31 10 34 81 25 85 217 19 52 216 97 24 77 130 90 44 39 15 30 36 123 151 21 101 49 98 28 84 93 72 38 99 62 48 16 32 51 74 29 59 76 47 126 229 40 103 121 157 114 53 83 43 54 222 66 141 37 79 70 80 194 117 100 111 67 219 55 152 104 182 88 82 71 78 35 122 105 108 69 175 135 91 65 68 119 63 140 113 358 58 248 129 56 89 57 139 176 136 195 73 109 208 115 110 147 273 102 185 171 168 232 95 86 148 112 128 254 198 298 154 116 142 131 347 204 200 94 118 344 162 308 199 309 96 274 224 143 133 127 186] CREDIT_LIMIT [ 1000. 7000. 7500. 1200. 1800. 13500. 2300. 11000. 2000. 3000. 8000. 2500. 13000. 4000. 11250. 9000. 6000. 1700. 10500. 6900. 5000. 4500. 1500. 8500. 2400. 4200. 3300. 12000. 3500. 6500. 1600. 4150. 1850. 6250. 2250. 9500. 16500. 5700. 5500. 17000. 3200. 19000. 2800. 18000. 21500. 10000. 20000. 7900. 15000. 12500. 14000. 5300. 900. 6150. 11500. 23000. 2700. 14500. 19500. 1400. 2900. 1950. 7200. 10950. 2100. 500. 21000. 4800. 7300. 18500. 6550. 6200. 3150. 11800. 11300. 2750. 10300. 2850. 17500. 22500. 6750. 16000. 1750. 15500. 3800. 6600. 3650. 2600. 4600. 3350. 4300. 5250. 28000. 3600. 10200. 750. 7600. 2720. 2200. 7350. 300. 3750. 1900. 10400. 5200. 4350. 3050. 6700. 20500. 5750. 8300. 7950. 3100. 4050. 6300. 3666.666667 200. 9100. 3700. 7100. 4750. 3900. 10100. 1300. 19600. 
10450. 11150. 600. 4700. 150. 3400. 1350. 5100. 7227.272727 13550. 9200. 6100. 2050. 6400. 4250. 14600. 8600. 3511.111111 5600. 6727.272727 13600. 8800. 11100. 12200. 8100. 8050. 4100. 5550. 4450. 5650. 8200. 5400. 25000. 9950. 1150. 6800. 4650. 7800. 2150. 22000. 400. 7700. 4400. 1050. 700. 5181.818182 4900. 1100. 2550. 2283.333333 30000. 9700. 9400. nan 2725. 450. 8700. 12300. 7050. 1550. 9800. 5450. 800. 8954.545455 2950. 2450. 6850. 4950. 5900. 1250. 17150. 9600. 50. 10750. 1120. 13450. 3777.777778 650. 1450. 2350. 1833.333333] PAYMENTS [ 201.802084 4103.032597 622.066742 ... 81.270775 52.549959 63.165404] MINIMUM_PAYMENTS [ 139.509787 1072.340217 627.284787 ... 82.418369 55.755628 88.288956] PRC_FULL_PAYMENT [0. 0.222222 1. 0.25 0.083333 0.3 0.333333 0.166667 0.111111 0.916667 0.2 0.090909 0.454545 0.181818 0.444444 0.636364 0.5 0.75 0.142857 0.888889 0.545455 0.818182 0.363636 0.833333 0.666667 0.909091 0.1 0.583333 0.8 0.416667 0.4 0.125 0.714286 0.6 0.571429 0.375 0.9 0.285714 0.7 0.272727 0.777778 0.875 0.727273 0.428571 0.625 0.857143 0.555556] TENURE [12 8 11 9 10 7 6]
# Report the number of distinct non-null values per column as "<column>---><count>".
for column_name, distinct_count in creditcard_df.nunique().items():
    print(column_name, distinct_count, sep='--->')
CUST_ID--->8950 BALANCE--->8871 BALANCE_FREQUENCY--->43 PURCHASES--->6203 ONEOFF_PURCHASES--->4014 INSTALLMENTS_PURCHASES--->4452 CASH_ADVANCE--->4323 PURCHASES_FREQUENCY--->47 ONEOFF_PURCHASES_FREQUENCY--->47 PURCHASES_INSTALLMENTS_FREQUENCY--->47 CASH_ADVANCE_FREQUENCY--->54 CASH_ADVANCE_TRX--->65 PURCHASES_TRX--->173 CREDIT_LIMIT--->205 PAYMENTS--->8711 MINIMUM_PAYMENTS--->8636 PRC_FULL_PAYMENT--->47 TENURE--->7
# Re-display the descriptive statistics of the numeric columns.
creditcard_df.describe()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8949.000000 | 8950.000000 | 8637.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.449450 | 1733.143852 | 864.206542 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.815725 | 2895.063757 | 2372.446607 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 169.123707 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 825.485459 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
# Show the customer(s) holding the largest PURCHASES value.
# The maximum is taken from the data itself rather than typed in from the
# rounded describe() table, so the exact-equality filter cannot miss because
# of display rounding and keeps working if the data changes.
max_purchases = creditcard_df['PURCHASES'].max()
creditcard_df[creditcard_df['PURCHASES'] == max_purchases]
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 550 | C10574 | 11547.52001 | 1.0 | 49039.57 | 40761.25 | 8278.32 | 558.166886 | 1.0 | 1.0 | 0.916667 | 0.083333 | 1 | 101 | 22500.0 | 46930.59824 | 2974.069421 | 0.25 | 12 |
# Select every customer whose CASH_ADVANCE is exactly zero.
no_cash_advance = creditcard_df['CASH_ADVANCE'].eq(0)
creditcard_df.loc[no_cash_advance]
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.0 | 0.166667 | 0.000000 | 0.083333 | 0.0 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.00 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.0 | 1.000000 | 1.000000 | 0.000000 | 0.0 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.00 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.0 | 0.083333 | 0.083333 | 0.000000 | 0.0 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.00 | 12 |
| 5 | C10006 | 1809.828751 | 1.000000 | 1333.28 | 0.00 | 1333.28 | 0.0 | 0.666667 | 0.000000 | 0.583333 | 0.0 | 0 | 8 | 1800.0 | 1400.057770 | 2407.246035 | 0.00 | 12 |
| 6 | C10007 | 627.260806 | 1.000000 | 7091.01 | 6402.63 | 688.38 | 0.0 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0 | 64 | 13500.0 | 6354.314328 | 198.065894 | 1.00 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8943 | C19184 | 5.871712 | 0.500000 | 20.90 | 20.90 | 0.00 | 0.0 | 0.166667 | 0.166667 | 0.000000 | 0.0 | 0 | 1 | 500.0 | 58.644883 | 43.473717 | 0.00 | 6 |
| 8944 | C19185 | 193.571722 | 0.833333 | 1012.73 | 1012.73 | 0.00 | 0.0 | 0.333333 | 0.333333 | 0.000000 | 0.0 | 0 | 2 | 4000.0 | 0.000000 | NaN | 0.00 | 6 |
| 8945 | C19186 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.0 | 1.000000 | 0.000000 | 0.833333 | 0.0 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.50 | 6 |
| 8946 | C19187 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.0 | 1.000000 | 0.000000 | 0.833333 | 0.0 | 0 | 6 | 1000.0 | 275.861322 | NaN | 0.00 | 6 |
| 8947 | C19188 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.0 | 0.833333 | 0.000000 | 0.666667 | 0.0 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.25 | 6 |
4628 rows × 18 columns
# Count the missing entries in each column.
creditcard_df.isna().sum()
CUST_ID 0 BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 1 PAYMENTS 0 MINIMUM_PAYMENTS 313 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
# Draw the null mask as a heatmap to show where values are missing; row tick
# labels are suppressed.
sb.heatmap(creditcard_df.isnull(), cmap='Oranges_r', yticklabels=False)
<AxesSubplot:>
# Mean of the MINIMUM_PAYMENTS column, used next to fill its missing entries.
mean_value = creditcard_df['MINIMUM_PAYMENTS'].mean()
mean_value
864.2065423050816
# Fill the missing MINIMUM_PAYMENTS entries with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which no longer updates the parent frame once copy-on-write is active.
creditcard_df['MINIMUM_PAYMENTS'] = creditcard_df['MINIMUM_PAYMENTS'].fillna(mean_value)
# Mean of the CREDIT_LIMIT column, used next to fill its missing entry.
# Note: this rebinds the same `mean_value` name used for MINIMUM_PAYMENTS.
mean_value = creditcard_df['CREDIT_LIMIT'].mean()
mean_value
4494.043836631879
# Fill the missing CREDIT_LIMIT entry with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which no longer updates the parent frame once copy-on-write is active.
creditcard_df['CREDIT_LIMIT'] = creditcard_df['CREDIT_LIMIT'].fillna(mean_value)
# Re-count missing entries per column to confirm the fills took effect.
creditcard_df.isna().sum()
CUST_ID 0 BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 0 PAYMENTS 0 MINIMUM_PAYMENTS 0 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
# Redraw the null-mask heatmap after filling the missing values.
sb.heatmap(creditcard_df.isnull(), cmap='Oranges_r', yticklabels=False)
<AxesSubplot:>
# Count rows that are exact duplicates of an earlier row.
creditcard_df.duplicated().sum()
0
# Discard the customer identifier column, modifying the frame in place.
creditcard_df.drop(columns=['CUST_ID'], inplace=True)
# Column labels remaining after CUST_ID was dropped.
creditcard_df.columns
Index(['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
'TENURE'],
dtype='object')
Histogram and KDE (Kernel Density Estimate) plots
# Draw one stacked panel per column: a density-normalised histogram (red)
# overlaid with a kernel density estimate curve (green).
# histplot + kdeplot are used because distplot is deprecated and scheduled for
# removal; the number of subplot rows follows the number of columns.
plot_columns = list(creditcard_df.columns)
plt.figure(figsize=(10, 50))
for position, column in enumerate(plot_columns, start=1):
    ax = plt.subplot(len(plot_columns), 1, position)
    # stat='density' puts the bars on the same scale as the KDE curve.
    sb.histplot(x=creditcard_df[column], stat='density', color='r', ax=ax)
    sb.kdeplot(x=creditcard_df[column], color='g', linewidth=2, label='KDE', ax=ax)
    plt.title(column, color='b', size=10)
# Adjust the spacing once, after every panel exists.
plt.tight_layout()
plt.show()
C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). 
warnings.warn(msg, FutureWarning) C:\Users\Laxman Prasad\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
sb.pairplot(creditcard_df)
# Observations from the pair plot:
# - 'PURCHASES' is correlated with 'ONEOFF_PURCHASES' and 'INSTALLMENTS_PURCHASES'.
# - 'PURCHASES' trends with 'CREDIT_LIMIT' and 'PAYMENTS'.
<seaborn.axisgrid.PairGrid at 0x2b2be39e160>
# Pairwise correlation matrix of the remaining columns (CUST_ID was dropped earlier).
corr = creditcard_df.corr()
corr
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BALANCE | 1.000000 | 0.322412 | 0.181261 | 0.164350 | 0.126469 | 0.496692 | -0.077944 | 0.073166 | -0.063186 | 0.449218 | 0.385152 | 0.154338 | 0.531320 | 0.322802 | 0.394282 | -0.318959 | 0.072692 |
| BALANCE_FREQUENCY | 0.322412 | 1.000000 | 0.133674 | 0.104323 | 0.124292 | 0.099388 | 0.229715 | 0.202415 | 0.176079 | 0.191873 | 0.141555 | 0.189626 | 0.096124 | 0.065008 | 0.114249 | -0.095082 | 0.119776 |
| PURCHASES | 0.181261 | 0.133674 | 1.000000 | 0.916845 | 0.679896 | -0.051474 | 0.393017 | 0.498430 | 0.315567 | -0.120143 | -0.067175 | 0.689561 | 0.356991 | 0.603264 | 0.093515 | 0.180379 | 0.086288 |
| ONEOFF_PURCHASES | 0.164350 | 0.104323 | 0.916845 | 1.000000 | 0.330622 | -0.031326 | 0.264937 | 0.524891 | 0.127729 | -0.082628 | -0.046212 | 0.545523 | 0.319743 | 0.567292 | 0.048597 | 0.132763 | 0.064150 |
| INSTALLMENTS_PURCHASES | 0.126469 | 0.124292 | 0.679896 | 0.330622 | 1.000000 | -0.064244 | 0.442418 | 0.214042 | 0.511351 | -0.132318 | -0.073999 | 0.628108 | 0.256533 | 0.384084 | 0.131687 | 0.182569 | 0.086143 |
| CASH_ADVANCE | 0.496692 | 0.099388 | -0.051474 | -0.031326 | -0.064244 | 1.000000 | -0.215507 | -0.086754 | -0.177070 | 0.628522 | 0.656498 | -0.075850 | 0.304008 | 0.453238 | 0.139223 | -0.152935 | -0.068312 |
| PURCHASES_FREQUENCY | -0.077944 | 0.229715 | 0.393017 | 0.264937 | 0.442418 | -0.215507 | 1.000000 | 0.501343 | 0.862934 | -0.308478 | -0.203478 | 0.568430 | 0.119908 | 0.103464 | 0.002976 | 0.305802 | 0.061506 |
| ONEOFF_PURCHASES_FREQUENCY | 0.073166 | 0.202415 | 0.498430 | 0.524891 | 0.214042 | -0.086754 | 0.501343 | 1.000000 | 0.142329 | -0.111716 | -0.069088 | 0.544869 | 0.295090 | 0.243537 | -0.029963 | 0.157531 | 0.082466 |
| PURCHASES_INSTALLMENTS_FREQUENCY | -0.063186 | 0.176079 | 0.315567 | 0.127729 | 0.511351 | -0.177070 | 0.862934 | 0.142329 | 1.000000 | -0.262958 | -0.169207 | 0.529975 | 0.060851 | 0.085551 | 0.029590 | 0.250087 | 0.073275 |
| CASH_ADVANCE_FREQUENCY | 0.449218 | 0.191873 | -0.120143 | -0.082628 | -0.132318 | 0.628522 | -0.308478 | -0.111716 | -0.262958 | 1.000000 | 0.799561 | -0.131168 | 0.132591 | 0.183192 | 0.097898 | -0.249773 | -0.133372 |
| CASH_ADVANCE_TRX | 0.385152 | 0.141555 | -0.067175 | -0.046212 | -0.073999 | 0.656498 | -0.203478 | -0.069088 | -0.169207 | 0.799561 | 1.000000 | -0.066157 | 0.149728 | 0.255278 | 0.109185 | -0.169784 | -0.043421 |
| PURCHASES_TRX | 0.154338 | 0.189626 | 0.689561 | 0.545523 | 0.628108 | -0.075850 | 0.568430 | 0.544869 | 0.529975 | -0.131168 | -0.066157 | 1.000000 | 0.272928 | 0.370832 | 0.095858 | 0.162066 | 0.121874 |
| CREDIT_LIMIT | 0.531320 | 0.096124 | 0.356991 | 0.319743 | 0.256533 | 0.304008 | 0.119908 | 0.295090 | 0.060851 | 0.132591 | 0.149728 | 0.272928 | 1.000000 | 0.421895 | 0.125167 | 0.055727 | 0.139486 |
| PAYMENTS | 0.322802 | 0.065008 | 0.603264 | 0.567292 | 0.384084 | 0.453238 | 0.103464 | 0.243537 | 0.085551 | 0.183192 | 0.255278 | 0.370832 | 0.421895 | 1.000000 | 0.125046 | 0.112138 | 0.106136 |
| MINIMUM_PAYMENTS | 0.394282 | 0.114249 | 0.093515 | 0.048597 | 0.131687 | 0.139223 | 0.002976 | -0.029963 | 0.029590 | 0.097898 | 0.109185 | 0.095858 | 0.125167 | 0.125046 | 1.000000 | -0.139674 | 0.057257 |
| PRC_FULL_PAYMENT | -0.318959 | -0.095082 | 0.180379 | 0.132763 | 0.182569 | -0.152935 | 0.305802 | 0.157531 | 0.250087 | -0.249773 | -0.169784 | 0.162066 | 0.055727 | 0.112138 | -0.139674 | 1.000000 | -0.016486 |
| TENURE | 0.072692 | 0.119776 | 0.086288 | 0.064150 | 0.086143 | -0.068312 | 0.061506 | 0.082466 | 0.073275 | -0.133372 | -0.043421 | 0.121874 | 0.139486 | 0.106136 | 0.057257 | -0.016486 | 1.000000 |
# Annotated heatmap of the correlation matrix held in `corr`.
# A diverging colormap pinned to [-1, 1] and centred on 0 is used so colour
# encodes both the sign and the strength of each correlation; a qualitative
# palette such as 'Accent_r' has unordered colours and cannot convey either.
plt.figure(figsize=(15, 15))
sb.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    yticklabels=True,
)
<AxesSubplot:>
from sklearn.preprocessing import StandardScaler
# Standardise every column with StandardScaler; the scaled data is what the
# K-Means models below are fitted on. fit_transform returns a NumPy array
# (see the displayed output), not a DataFrame.
scaler = StandardScaler()
creditcard_df_scaled= scaler.fit_transform(creditcard_df)
creditcard_df_scaled
array([[-0.73198937, -0.24943448, -0.42489974, ..., -0.31096755,
-0.52555097, 0.36067954],
[ 0.78696085, 0.13432467, -0.46955188, ..., 0.08931021,
0.2342269 , 0.36067954],
[ 0.44713513, 0.51808382, -0.10766823, ..., -0.10166318,
-0.52555097, 0.36067954],
...,
[-0.7403981 , -0.18547673, -0.40196519, ..., -0.33546549,
0.32919999, -4.12276757],
[-0.74517423, -0.18547673, -0.46955188, ..., -0.34690648,
0.32919999, -4.12276757],
[-0.57257511, -0.88903307, 0.04214581, ..., -0.33294642,
-0.52555097, -4.12276757]])
# (rows, features) shape of the scaled array.
creditcard_df_scaled.shape
(8950, 17)
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..19 on the scaled data and record the
# within-cluster sum of squares (inertia_) for each k.
cluster_counts = range(1, 20)
wcss = []
for k in cluster_counts:
    # random_state makes the curve reproducible between runs; n_init is pinned
    # because its default differs between scikit-learn releases.
    elbow_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    elbow_model.fit(creditcard_df_scaled)
    wcss.append(elbow_model.inertia_)

# Plot against the actual k values. Passing only `wcss` would put the points
# on the 0-based list index, shifting the whole curve one cluster to the left.
plt.plot(cluster_counts, wcss, 'r*-', markersize=10)
plt.xticks(cluster_counts)
plt.title('Plot to find number of clusters', color = 'Red', size = 15)
plt.xlabel('Clusters', color = 'Blue', size = 12)
plt.ylabel('wcss scores', color = 'Blue', size = 12)
plt.show()
# Final model with 8 clusters.
# init='k-means++' seeds the starting centroids with the k-means++ scheme
# rather than picking them purely at random. n_init and random_state are
# pinned so the cluster numbering is the same on every run.
Kmeans = KMeans(n_clusters=8, init='k-means++', n_init=10, random_state=42)
Kmeans.fit(creditcard_df_scaled)
KMeans()
# Per-row cluster assignments from the fitted model, and the array's shape.
labels = Kmeans.labels_
labels.shape
(8950,)
# Smallest cluster index present in the assignments.
labels.min()
0
# Largest cluster index present in the assignments.
labels.max()
7
# Display the per-row cluster assignments.
Kmeans.labels_
array([1, 6, 5, ..., 2, 2, 2])
# Element dtype of the assignment array.
Kmeans.labels_.dtype
dtype('int32')
# Display the per-row cluster assignments once more.
Kmeans.labels_
array([1, 6, 5, ..., 2, 2, 2])
# Centroid coordinates of each cluster, in the scaled feature space the model
# was fitted on.
Kmeans.cluster_centers_
array([[-3.36049587e-01, -3.47077860e-01, -2.89266864e-01,
-2.15966042e-01, -2.86835279e-01, 6.82835664e-02,
-2.03077759e-01, -2.88661118e-01, -2.24548914e-01,
3.08663197e-01, 9.96472666e-04, -3.88116768e-01,
-5.68620439e-01, -3.92679966e-01, -2.09145007e-01,
1.40113780e-02, -3.20373299e+00],
[-7.01229283e-01, -2.14411628e+00, -3.11098786e-01,
-2.35720118e-01, -3.02413627e-01, -3.21904751e-01,
-5.56586290e-01, -4.44988775e-01, -4.39729749e-01,
-5.20843751e-01, -3.76102518e-01, -4.19789965e-01,
-1.77040035e-01, -2.02048177e-01, -2.56658204e-01,
2.81550499e-01, 1.99199458e-01],
[ 1.88267000e-02, 4.03379486e-01, -3.62129153e-01,
-2.47327073e-01, -4.01753170e-01, -8.93531552e-02,
-8.67254702e-01, -4.10337326e-01, -7.58633418e-01,
1.12552518e-01, -2.25036977e-02, -4.87082010e-01,
-3.05667743e-01, -2.49499439e-01, -3.67394168e-03,
-4.57045384e-01, 2.72461662e-01],
[-3.66739830e-01, 3.33338380e-01, -3.85389595e-02,
-2.44302992e-01, 3.57516154e-01, -3.63618503e-01,
9.90253126e-01, -3.87060270e-01, 1.20549554e+00,
-4.75146398e-01, -3.61199635e-01, 1.86768973e-01,
-2.61189644e-01, -2.17939671e-01, -4.59793867e-02,
3.14958533e-01, 2.57483792e-01],
[ 1.69145526e+00, 3.92898471e-01, -2.17992157e-01,
-1.56311793e-01, -2.28334747e-01, 2.02894159e+00,
-4.77769879e-01, -2.12558400e-01, -4.14961382e-01,
1.92718079e+00, 1.94262279e+00, -2.66771670e-01,
1.03161365e+00, 8.26932696e-01, 5.13220493e-01,
-3.90436041e-01, 7.02783875e-02],
[ 9.19935407e-01, 4.67655287e-01, 2.25449078e+00,
1.72111588e+00, 2.16875635e+00, -1.93345936e-01,
1.16056707e+00, 1.55482581e+00, 1.24634100e+00,
-3.09466958e-01, -2.09799089e-01, 2.78190077e+00,
1.22528374e+00, 1.28391524e+00, 5.66182497e-01,
2.88329286e-01, 3.34055749e-01],
[ 1.92305052e+00, 3.37716867e-01, 1.12120418e+01,
1.06003671e+01, 7.03311838e+00, 4.19625127e-01,
1.04698328e+00, 1.91550149e+00, 9.81333653e-01,
-2.58911831e-01, 6.12285458e-02, 5.36243850e+00,
3.04400629e+00, 8.09897492e+00, 1.12031761e+00,
1.11013157e+00, 3.10863465e-01],
[-1.62727000e-01, 3.92065093e-01, 4.63858580e-01,
6.06423226e-01, -1.74695827e-02, -3.33497066e-01,
9.42521833e-01, 1.87730926e+00, 8.89931125e-02,
-4.08096446e-01, -3.23289849e-01, 5.28626139e-01,
3.83552937e-01, 9.37912723e-02, -1.62173705e-01,
4.07629642e-01, 2.61092550e-01]])
# Confirm the centroid matrix is laid out as (n_clusters, n_features).
Kmeans.cluster_centers_.shape
(8, 17)
# Total number of centroid coordinates (n_clusters * n_features).
Kmeans.cluster_centers_.size
136
# Tabulate the fitted centroids (still in the scaled feature space): one row
# per cluster, one column per feature.
# The column Index is passed directly; wrapping it in a list makes pandas
# build a one-level MultiIndex, which complicates plain column selection.
cluster_centers = pd.DataFrame(data=Kmeans.cluster_centers_, columns=creditcard_df.columns)
cluster_centers
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.336050 | -0.347078 | -0.289267 | -0.215966 | -0.286835 | 0.068284 | -0.203078 | -0.288661 | -0.224549 | 0.308663 | 0.000996 | -0.388117 | -0.568620 | -0.392680 | -0.209145 | 0.014011 | -3.203733 |
| 1 | -0.701229 | -2.144116 | -0.311099 | -0.235720 | -0.302414 | -0.321905 | -0.556586 | -0.444989 | -0.439730 | -0.520844 | -0.376103 | -0.419790 | -0.177040 | -0.202048 | -0.256658 | 0.281550 | 0.199199 |
| 2 | 0.018827 | 0.403379 | -0.362129 | -0.247327 | -0.401753 | -0.089353 | -0.867255 | -0.410337 | -0.758633 | 0.112553 | -0.022504 | -0.487082 | -0.305668 | -0.249499 | -0.003674 | -0.457045 | 0.272462 |
| 3 | -0.366740 | 0.333338 | -0.038539 | -0.244303 | 0.357516 | -0.363619 | 0.990253 | -0.387060 | 1.205496 | -0.475146 | -0.361200 | 0.186769 | -0.261190 | -0.217940 | -0.045979 | 0.314959 | 0.257484 |
| 4 | 1.691455 | 0.392898 | -0.217992 | -0.156312 | -0.228335 | 2.028942 | -0.477770 | -0.212558 | -0.414961 | 1.927181 | 1.942623 | -0.266772 | 1.031614 | 0.826933 | 0.513220 | -0.390436 | 0.070278 |
| 5 | 0.919935 | 0.467655 | 2.254491 | 1.721116 | 2.168756 | -0.193346 | 1.160567 | 1.554826 | 1.246341 | -0.309467 | -0.209799 | 2.781901 | 1.225284 | 1.283915 | 0.566182 | 0.288329 | 0.334056 |
| 6 | 1.923051 | 0.337717 | 11.212042 | 10.600367 | 7.033118 | 0.419625 | 1.046983 | 1.915501 | 0.981334 | -0.258912 | 0.061229 | 5.362438 | 3.044006 | 8.098975 | 1.120318 | 1.110132 | 0.310863 |
| 7 | -0.162727 | 0.392065 | 0.463859 | 0.606423 | -0.017470 | -0.333497 | 0.942522 | 1.877309 | 0.088993 | -0.408096 | -0.323290 | 0.528626 | 0.383553 | 0.093791 | -0.162174 | 0.407630 | 0.261093 |
# Map the centroids back to the original feature units so they can be read
# on the same scale as the raw data.
# The transform reads Kmeans.cluster_centers_ rather than the cluster_centers
# variable so that re-running this cell does not un-scale the values twice.
cluster_centers = scaler.inverse_transform(Kmeans.cluster_centers_)
# Pass the column Index directly (not wrapped in a list) to get ordinary
# column labels instead of a one-level MultiIndex.
cluster_centers = pd.DataFrame(data=cluster_centers, columns=creditcard_df.columns)
cluster_centers
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 865.015978 | 0.795051 | 385.181720 | 233.977974 | 151.686061 | 1122.064941 | 0.408846 | 0.116344 | 0.275196 | 0.196911 | 3.255627 | 5.062701 | 2425.054994 | 596.373827 | 376.802926 | 0.157813 | 7.229904 |
| 1 | 104.925267 | 0.369349 | 338.537483 | 201.190254 | 137.598754 | 303.821813 | 0.266966 | 0.069709 | 0.189677 | 0.030918 | 0.682203 | 4.275424 | 3849.863936 | 1148.234177 | 266.075424 | 0.236063 | 11.783898 |
| 2 | 1603.661014 | 0.972828 | 229.510316 | 181.925086 | 47.767238 | 791.493372 | 0.142279 | 0.080046 | 0.062937 | 0.157667 | 3.095256 | 2.602795 | 3381.837685 | 1010.867422 | 855.644576 | 0.020037 | 11.881942 |
| 3 | 801.136827 | 0.956236 | 920.865753 | 186.944441 | 734.365067 | 216.346127 | 0.887787 | 0.086990 | 0.843532 | 0.040063 | 0.783905 | 19.352211 | 3543.676206 | 1102.229857 | 757.053521 | 0.245835 | 11.861898 |
| 4 | 5085.096182 | 0.970345 | 537.461230 | 332.991810 | 204.587367 | 5233.656408 | 0.298598 | 0.139047 | 0.199521 | 0.520793 | 16.505800 | 8.078886 | 8247.685088 | 4127.032982 | 2060.245252 | 0.039519 | 11.611369 |
| 5 | 3479.242725 | 0.988054 | 5819.959145 | 3449.137221 | 2372.247102 | 573.415653 | 0.956142 | 0.666292 | 0.859765 | 0.073217 | 1.817102 | 83.857482 | 8952.375297 | 5449.952679 | 2183.670969 | 0.238046 | 11.964371 |
| 6 | 5567.142164 | 0.957273 | 24957.905000 | 18186.875667 | 6771.029333 | 1858.844605 | 0.910556 | 0.773889 | 0.754444 | 0.083333 | 3.666667 | 148.000000 | 15570.000000 | 25178.882690 | 3475.059479 | 0.478409 | 11.933333 |
| 7 | 1225.772313 | 0.970147 | 1994.245839 | 1598.975721 | 395.270118 | 279.512186 | 0.868630 | 0.762495 | 0.399805 | 0.053480 | 1.042611 | 27.849501 | 5889.643946 | 2004.660395 | 486.267564 | 0.272939 | 11.866727 |
# e.g. 865.015978 is the BALANCE coordinate of cluster 0's centroid, in original units
# Assign every row to its nearest centroid of the already-fitted model.
# predict() is used instead of fit_predict(): refitting runs the clustering
# again and can renumber the clusters, so the ids would no longer line up
# with the centroid table built from the earlier fit.
y_pred_Kmeans = Kmeans.predict(creditcard_df_scaled)
y_pred_Kmeans
array([1, 3, 4, ..., 5, 5, 5])
# Check that there is one predicted label per input row.
y_pred_Kmeans.shape
(8950,)
# Attach each row's cluster id to the unscaled records as a 'cluster' column.
cluster_column = pd.DataFrame({'cluster': labels})
creditcard_df_cluster = pd.concat(
    [creditcard_df, cluster_column],
    axis="columns",
)
creditcard_df_cluster.head()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 | 2 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 | 4 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 | 7 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 | 2 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 | 2 |
from sklearn.decomposition import PCA
# Reduce the scaled feature matrix to its first three principal components.
n_pca_components = 3
pca = PCA(n_components=n_pca_components)
principal_comp = pca.fit_transform(creditcard_df_scaled)
principal_comp
array([[-1.68225769, -1.07630908, 0.48864524],
[-1.13810768, 2.50656929, 0.60157448],
[ 0.96974331, -0.38351454, 0.10217595],
...,
[-0.92638881, -1.81085314, -0.4749828 ],
[-2.33669015, -0.65798024, 0.9745684 ],
[-0.55658195, -0.40062203, 1.0153537 ]])
# Check the projection: one row per input row, one column per component.
principal_comp.shape
(8950, 3)
# Wrap the projected coordinates in a DataFrame, one named column per component.
component_names = ['pca1', 'pca2', 'pca3']
pca_df = pd.DataFrame(principal_comp, columns=component_names)
pca_df.head()
| pca1 | pca2 | pca3 | |
|---|---|---|---|
| 0 | -1.682258 | -1.076309 | 0.488645 |
| 1 | -1.138108 | 2.506569 | 0.601574 |
| 2 | 0.969743 | -0.383515 | 0.102176 |
| 3 | -0.873565 | 0.043210 | 1.460065 |
| 4 | -1.599441 | -0.688422 | 0.365168 |
# Check the dtype of each principal-component column.
pca_df.dtypes
pca1 float64 pca2 float64 pca3 float64 dtype: object
# Count how many times each distinct row occurs in pca_df.
# NOTE(review): the components are continuous floats, so every row is
# expected to be unique and every count to be 1 -- confirm this is useful.
pca_df.value_counts()
pca1 pca2 pca3 cluster
-2.765679 -0.874989 2.076017 0 1
0.302727 -0.779982 -1.629229 3 1
0.299038 -1.352426 -1.665409 3 1
0.299403 -1.665631 -1.712216 3 1
0.299650 0.642995 -0.774394 2 1
..
-1.190486 -1.060328 1.356908 1 1
-1.190502 -0.566636 0.678175 2 1
-1.190849 -0.770999 0.756511 0 1
-1.190974 -0.569213 0.165596 2 1
29.605347 3.538914 10.076441 6 1
Length: 8950, dtype: int64
# Append the cluster label of every row to the PCA frame as a 'cluster' column.
cluster_labels = pd.DataFrame({'cluster': labels})
pca_df = pd.concat([pca_df, cluster_labels], axis="columns")
pca_df.head()
| pca1 | pca2 | pca3 | cluster | |
|---|---|---|---|---|
| 0 | -1.682258 | -1.076309 | 0.488645 | 2 |
| 1 | -1.138108 | 2.506569 | 0.601574 | 4 |
| 2 | 0.969743 | -0.383515 | 0.102176 | 7 |
| 3 | -0.873565 | 0.043210 | 1.460065 | 2 |
| 4 | -1.599441 | -0.688422 | 0.365168 | 2 |
# Scatter the first two principal components, coloured by cluster id.
plt.figure(figsize=(10, 10))
df = sb.scatterplot(
    data=pca_df,
    x="pca1",
    y="pca2",
    hue="cluster",
)
plt.show()
# Scatter pca1 against pca2 with an explicit colour for each of the eight clusters.
cluster_palette = ['red', 'green', 'blue', 'pink', 'yellow', 'gray', 'purple', 'black']
plt.figure(figsize=(10, 10))
df = sb.scatterplot(
    data=pca_df,
    x="pca1",
    y="pca2",
    hue="cluster",
    palette=cluster_palette,
)
plt.show()
# Scatter the second principal component against the third, coloured by cluster.
# The original call supplied only x="pca2", leaving no y variable to plot the
# points against.
# NOTE(review): "pca3" is assumed to be the intended y axis (the one component
# not shown in the other plots) -- confirm.
plt.figure(figsize=(10, 10))
df = sb.scatterplot(x="pca2", y="pca3", hue="cluster", data=pca_df)
plt.show()